--- title: Audio Datasets keywords: fastai sidebar: home_sidebar summary: "This module defines datasets to use with audio data for Deep Learning applications" description: "This module defines datasets to use with audio data for Deep Learning applications" nb_path: "nbs/01audio_dataset.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

The data structure for the audio labels is defined by a dataframe with the columns (id, label, tmin, tmax, fmin, fmax) (see example below). Each row defines a bounding box of time and frequency for some label in the corresponding audio clip. Multiple boxes may exist for the same audio id.

id label tmin tmax fmin fmax ...
a239gfdda 10 2.4 4.1 5000 10000 ...
b94k2g0as 4 23.7 40.3 2500 7000 ...
{% raw %}

class Datasets[source]

Datasets(items=None, tfms=None, tls=None, n_inp=None, dl_type=None, use_list=False, match=None) :: FilteredBase

A dataset that creates a tuple from each tfms, passed through item_tfms

{% endraw %} {% raw %}

class DataLoader[source]

DataLoader(dataset=None, bs=None, num_workers=0, pin_memory=False, timeout=0, batch_size=None, shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, wif=None, before_iter=None, after_item=None, before_batch=None, after_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None) :: GetAttr

API compatible with PyTorch DataLoader, with a lot more callbacks and flexibility

{% endraw %} {% raw %}

class DataLoaders[source]

DataLoaders(*loaders, path='.', device=None) :: GetAttr

Basic wrapper around several DataLoaders.

{% endraw %} {% raw %}

class RenameColumns[source]

RenameColumns(id='id', label='label', tmin='tmin', tmax='tmax', fmin='fmin', fmax='fmax') :: Transform

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

{% endraw %} {% raw %}

load_dataframe[source]

load_dataframe(file)

{% endraw %} {% raw %}

group_labels[source]

group_labels(df)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
%%time
path = Path('/kaggle/kaggle_rainforest_audio/data')
rename_cols = RenameColumns(id='recording_id', label='species_id', tmin='t_min', 
                            tmax='t_max',fmin='f_min', fmax='f_max')
df = Pipeline([load_dataframe, rename_cols, group_labels])(path/'train_tp.csv')
df.head()
CPU times: user 127 ms, sys: 480 µs, total: 127 ms
Wall time: 127 ms
id label songtype_id tmin fmin tmax fmax
0 003bec244 [14] [1] [44.544] [2531.25] [45.1307] [5531.25]
1 006ab765f [23] [1] [39.9615] [7235.16] [46.0452] [11283.4]
2 007f87ba2 [12] [1] [39.135999999999996] [562.5] [42.272] [3281.25]
3 0099c367b [17] [4] [51.4206] [1464.26] [55.1996] [4565.04]
4 009b760e6 [10] [1] [50.0854] [947.461] [52.5293] [10852.7]
{% endraw %} {% raw %}

time2pix_image[source]

time2pix_image(t, clip_duration, image_width)

{% endraw %} {% raw %}

time2pix_wave[source]

time2pix_wave(t, clip_duration, n_samples)

{% endraw %} {% raw %}

pix2time[source]

pix2time(pix, clip_duration, image_width)

{% endraw %} {% raw %}

pix2pix_image[source]

pix2pix_image(pix, image_width, n_samples)

{% endraw %} {% raw %}

time_labels[source]

time_labels(lbl, tmin, tmax, fmin, fmax, sample_rate, clip_duration, image_width, n_mels)

{% endraw %} {% raw %}

audio_crop[source]

audio_crop(wav, df_row, sample_rate, hop_length=512, n_mels=128, tile_width=256)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
%%time
sample_rate, hop_length, n_mels, tile_width = 32000, 512, 128, 256
i = 15
wav = load_npy(path/'npy32000'/'train'/f'{df.loc[i].id}.npy')
wav, label = audio_crop(wav, df.loc[i], sample_rate=sample_rate, tile_width=tile_width)
plt.imshow(melspectrogram(wav, sample_rate)[0], cmap='RdYlGn_r')
plt.imshow(label, alpha=0.5, cmap='jet')
plt.show()
CPU times: user 475 ms, sys: 15.5 ms, total: 490 ms
Wall time: 310 ms
{% endraw %} {% raw %}

reorganize_batch[source]

reorganize_batch(o)

{% endraw %} {% raw %}

create_dataset_item[source]

create_dataset_item(df_row, sample_rate, path=Path('.'), tile_width=256, hop_length=512, n_mels=128)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
%%time
path = Path('/kaggle/kaggle_rainforest_audio/data')
rename_cols = RenameColumns(id='recording_id', label='species_id', tmin='t_min', 
                            tmax='t_max',fmin='f_min', fmax='f_max')
df = Pipeline([load_dataframe, rename_cols, group_labels])(path/'train_tp.csv')
data = Datasets(items=df, tfms=partial(create_dataset_item, path=path, sample_rate=32000, 
                                       tile_width=256))
dls = DataLoader(data, bs=64, do_batch=reorganize_batch)
xb, yb = dls.one_batch()

wav, label = data[15][0][0], data[15][0][1]
plt.imshow(melspectrogram(wav, sample_rate)[0], cmap='RdYlGn_r')
plt.imshow(label, alpha=0.5, cmap='jet')
plt.show()
CPU times: user 763 ms, sys: 90.9 ms, total: 854 ms
Wall time: 629 ms
{% endraw %} {% raw %}

apply_augmentations[source]

apply_augmentations(o, augs_pipeline=<lambda>)

{% endraw %} {% raw %}

audio_augment[source]

audio_augment(sample_rate, p=0.5, gaussianSNR=True, gain=True, clipping=True, pitchshift=True, timestretch=True, freq_mask=True, time_mask=True, extra_augs=[])

{% endraw %} {% raw %}
{% endraw %} {% raw %}
%%time
path = Path('/kaggle/kaggle_rainforest_audio/data')
rename_cols = RenameColumns(id='recording_id', label='species_id', tmin='t_min', 
                            tmax='t_max',fmin='f_min', fmax='f_max')
df = Pipeline([load_dataframe, rename_cols, group_labels])(path/'train_tp.csv')
data = Datasets(items=df, tfms=partial(create_dataset_item, path=path, sample_rate=32000, 
                                       tile_width=256))
dls = DataLoader(data, bs=64, do_batch=reorganize_batch,             
                 after_item=partial(apply_augmentations, 
                 augs_pipeline=audio_augment(sample_rate, p=0.25)),
                 after_batch=MelSpectrogram(sample_rate))
xb, yb = dls.one_batch()
img, label = xb[15], yb[15]
plt.imshow(img[0], cmap='RdYlGn_r')
plt.imshow(label, alpha=0.5, cmap='jet')
plt.show()
CPU times: user 4.45 s, sys: 415 ms, total: 4.86 s
Wall time: 4.77 s
{% endraw %} {% raw %}

show_augmentations[source]

show_augmentations(data, dls, sample_rate, n=10, cmap='RdYlGn_r', vmin=-2, vmax=4)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
%%time
show_augmentations(data, dls, sample_rate=32000)
CPU times: user 13.7 s, sys: 565 ms, total: 14.2 s
Wall time: 11.8 s
{% endraw %}